In [ ]:
import pandas as pd
import seaborn as sns
import numpy as np
import os
import sys
import warnings

import plotly.io as pio
import plotly.express as px
import scipy.cluster.hierarchy as hierarchy

from collections import namedtuple
from matplotlib import pyplot as plt
from dotenv import load_dotenv
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler

sys.path.append("../")

from scripts.order_operations import get_min_max_dt, get_order_details  # Shows error but its ok, trust me
from scripts.optimizers_mp import k_means_optimizer # Same
from scripts.optimizers import dbscan_optimizer  # Same Same

data_path = "../data"

pio.renderers.default = "notebook_connected"

load_dotenv()
sns.color_palette('colorblind')
plt.style.use('Solarize_Light2')

# Setting default DPI, pulling it from dotenv if it exists, setting it on 100 if not

try:
    pc_dpi = int(os.getenv('DPI'))
except TypeError:
    pc_dpi = 100
if pc_dpi is None:
    pc_dpi = 100

1 : Introduction¶

2 : DataFrame creation, RFM logic¶

3 : Early visualisations¶

4 : Applying classification algorithms (K-Means, DBSCAN, Agglomerative clustering)¶

5 : Conclusions and potential improvements¶

1 : Introduction :¶

 The attempt of this notebook is focused on the widespread approach : RFM (Recency Frequency Monetary). Usual classifications using traditional (and fundamentally flawed) approaches will likely be attempted, like the Paretto distribution.

 Using RMF approach, we will first attempt to distinguish classes with the data AS IS - This is expected to produce poor results. We will then evolve towards classifying machine learning algorithms which use more resources but produce, in general, better results.

 It is expected that the data provided by this approach will be insufficient but will provide a good raw material to improve the models by improving the data quality and the algorithms.

2 : DataFrame creation, RFM logic¶

 The usage of RFM presupposes that we have the data to calculate the 3 indicators (R F & M). Not all datasets will be required for this step and there is no need to clog up the memory as the calculation time/resource is expected to go up.
 Will be required :

  • Olist_customers : will provide the link between the unique customer and it's aliases (and orders) - The goal is to eliminate the useless repetitivity of the alias.
  • Olist_orders : will provide the link between the customer (referred sometimes as cx.) and the order itself - The date the order was placed and which alias placed the order.
  • Olist_order_items : will provide what items were ordered at what price, determining the monetary component of each order

 The Recency will be defined by the time separating the most recent update from a Cx and the most recent update known - In this approach, we assume that the latest order placed is the most recent order (instead of using t0 = today).
 The Frequency will be : number_of_orders/membership_time. membership_time is : time elapsed between first order of account and last general order(not last order of account).
 Monetary will be the total of all item prices of all orders placed by a customer, Kaggle shows that freight is always paid (case where Cx orders 10 times the same item from the same seller --> Cx will pay 10 times the freight price).

In [ ]:
olist_customers_file = "../data/optimized/olist_customers.csv"
olist_orders_file = "../data/optimized/olist_orders.csv"
olist_order_items_file = "../data/optimized/olist_order_items.csv"
In [ ]:
df_customers = pd.read_csv(filepath_or_buffer=olist_customers_file)
df_orders = pd.read_csv(filepath_or_buffer=olist_orders_file)
df_orders_items = pd.read_csv(filepath_or_buffer=olist_order_items_file)
In [ ]:
# Dtypes were not carried over and will need to be enforced
# Or using pickles rather than CSV

df_customers["customer_id"] = df_customers["customer_id"].astype(np.uint32)
df_customers["customer_unique_id"] = df_customers["customer_unique_id"].astype(np.uint32)
In [ ]:
df_customers.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   customer_id               99441 non-null  uint32
 1   customer_unique_id        99441 non-null  uint32
 2   customer_zip_code_prefix  99441 non-null  int64 
 3   customer_city             99441 non-null  object
 4   customer_state            99441 non-null  object
dtypes: int64(1), object(2), uint32(2)
memory usage: 3.0+ MB
In [ ]:
df_customers.head()
Out[ ]:
customer_id customer_unique_id customer_zip_code_prefix customer_city customer_state
0 1 1 14409 franca SP
1 2 2 9790 sao bernardo do campo SP
2 3 3 1151 sao paulo SP
3 4 4 8775 mogi das cruzes SP
4 5 5 13056 campinas SP
In [ ]:
date_cols = [
        "order_purchase_dt", "order_approved_at",
        "order_delivered_carrier_date", "order_delivered_customer_date",
        "order_estimated_delivery_date"
    ]

for col in date_cols:
    df_orders[col] = pd.to_datetime(df_orders[col])

df_orders["order_id"] = df_orders["order_id"].astype(np.uint32)
df_orders["customer_id"] = df_orders["customer_id"].astype(np.uint32)
In [ ]:
df_orders.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99441 entries, 0 to 99440
Data columns (total 8 columns):
 #   Column                         Non-Null Count  Dtype         
---  ------                         --------------  -----         
 0   order_id                       99441 non-null  uint32        
 1   customer_id                    99441 non-null  uint32        
 2   order_status                   99441 non-null  object        
 3   order_purchase_dt              99441 non-null  datetime64[ns]
 4   order_approved_at              99281 non-null  datetime64[ns]
 5   order_delivered_carrier_date   97658 non-null  datetime64[ns]
 6   order_delivered_customer_date  96476 non-null  datetime64[ns]
 7   order_estimated_delivery_date  99441 non-null  datetime64[ns]
dtypes: datetime64[ns](5), object(1), uint32(2)
memory usage: 5.3+ MB
In [ ]:
df_orders.head()
Out[ ]:
order_id customer_id order_status order_purchase_dt order_approved_at order_delivered_carrier_date order_delivered_customer_date order_estimated_delivery_date
0 1 70297 delivered 2017-10-02 10:56:33 2017-10-02 11:07:15 2017-10-04 19:55:00 2017-10-10 21:25:13 2017-10-18
1 2 77028 delivered 2018-07-24 20:41:37 2018-07-26 03:24:27 2018-07-26 14:31:00 2018-08-07 15:27:45 2018-08-13
2 3 555 delivered 2018-08-08 08:38:49 2018-08-08 08:55:23 2018-08-08 13:50:00 2018-08-17 18:06:29 2018-09-04
3 4 61082 delivered 2017-11-18 19:28:06 2017-11-18 19:45:59 2017-11-22 13:39:59 2017-12-02 00:28:42 2017-12-15
4 5 67264 delivered 2018-02-13 21:18:39 2018-02-13 22:20:29 2018-02-14 19:46:34 2018-02-16 18:17:02 2018-02-26
In [ ]:
df_orders_items["order_id"] = df_orders_items["order_id"].astype(np.uint32)
df_orders_items["order_item_id"] = df_orders_items["order_item_id"].astype(np.uint32)
df_orders_items["product_id"] = df_orders_items["product_id"].astype(np.uint32)
df_orders_items["seller_id"] = df_orders_items["seller_id"].astype(np.uint32)

df_orders_items["shipping_limit_date"] = pd.to_datetime(df_orders_items["shipping_limit_date"])
In [ ]:
df_orders_items.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 112650 entries, 0 to 112649
Data columns (total 7 columns):
 #   Column               Non-Null Count   Dtype         
---  ------               --------------   -----         
 0   order_id             112650 non-null  uint32        
 1   order_item_id        112650 non-null  uint32        
 2   product_id           112650 non-null  uint32        
 3   seller_id            112650 non-null  uint32        
 4   shipping_limit_date  112650 non-null  datetime64[ns]
 5   price                112650 non-null  float64       
 6   freight_value        112650 non-null  float64       
dtypes: datetime64[ns](1), float64(2), uint32(4)
memory usage: 4.3 MB
In [ ]:
df_orders_items.head()
Out[ ]:
order_id order_item_id product_id seller_id shipping_limit_date price freight_value
0 85268 1 25866 514 2017-09-19 09:45:35 58.90 13.29
1 71854 1 27231 472 2017-05-03 11:05:13 239.90 19.93
2 6299 1 22625 1825 2018-01-18 14:48:30 199.00 17.87
3 22551 1 15404 2024 2018-08-15 10:10:18 12.99 12.79
4 5248 1 8863 1598 2017-02-13 13:57:51 199.90 18.14
In [ ]:
# Starting point can be orders df, we will remove infos as we need

df_orders.head()
Out[ ]:
order_id customer_id order_status order_purchase_dt order_approved_at order_delivered_carrier_date order_delivered_customer_date order_estimated_delivery_date
0 1 70297 delivered 2017-10-02 10:56:33 2017-10-02 11:07:15 2017-10-04 19:55:00 2017-10-10 21:25:13 2017-10-18
1 2 77028 delivered 2018-07-24 20:41:37 2018-07-26 03:24:27 2018-07-26 14:31:00 2018-08-07 15:27:45 2018-08-13
2 3 555 delivered 2018-08-08 08:38:49 2018-08-08 08:55:23 2018-08-08 13:50:00 2018-08-17 18:06:29 2018-09-04
3 4 61082 delivered 2017-11-18 19:28:06 2017-11-18 19:45:59 2017-11-22 13:39:59 2017-12-02 00:28:42 2017-12-15
4 5 67264 delivered 2018-02-13 21:18:39 2018-02-13 22:20:29 2018-02-14 19:46:34 2018-02-16 18:17:02 2018-02-26
In [ ]:
# Cx id, order_id, order_purchase_dt are useful, the rest can go

df_orders = df_orders[["order_id", "customer_id", "order_purchase_dt"]]

df_orders["customer_uid"] = np.uint32(0)
df_orders["sum_total"] = np.nan

df_orders.head()
Out[ ]:
order_id customer_id order_purchase_dt customer_uid sum_total
0 1 70297 2017-10-02 10:56:33 0 NaN
1 2 77028 2018-07-24 20:41:37 0 NaN
2 3 555 2018-08-08 08:38:49 0 NaN
3 4 61082 2017-11-18 19:28:06 0 NaN
4 5 67264 2018-02-13 21:18:39 0 NaN
In [ ]:
order_ids = df_orders["order_id"].unique()

for order_id in order_ids:
    index = df_orders.index[df_orders["order_id"] == order_id][0]
    order_value = df_orders_items[df_orders_items["order_id"] == order_id]["price"].sum()
    freight_value = df_orders_items[df_orders_items["order_id"] == order_id]["freight_value"].sum()
    df_orders.at[index, "sum_total"] = order_value + freight_value
In [ ]:
# Setting customers unique ids instead of cx_ids

cx_ids = df_orders["customer_id"]

for cx_id in cx_ids:
    index = df_orders.index[df_orders["customer_id"] == cx_id][0]
    cx_uid = df_customers[df_customers["customer_id"] == cx_id]["customer_unique_id"]
    df_orders.at[index, "customer_uid"] = cx_uid
In [ ]:
df_orders.head()
Out[ ]:
order_id customer_id order_purchase_dt customer_uid sum_total
0 1 70297 2017-10-02 10:56:33 68585 38.71
1 2 77028 2018-07-24 20:41:37 74977 141.46
2 3 555 2018-08-08 08:38:49 555 179.12
3 4 61082 2017-11-18 19:28:06 59790 72.20
4 5 67264 2018-02-13 21:18:39 65715 28.62
In [ ]:
rfm_cols = ["customer_uid", "order_id_list", "most_ancient_order_dt", "most_recent_order_dt", "recency", "frequency", "monetary"]
df_rfm = pd.DataFrame(columns=rfm_cols)
In [ ]:
uniques = df_orders["customer_uid"].unique()

uniques.sort()

df_rfm["customer_uid"] = uniques
In [ ]:
for uid in uniques:
    index = df_rfm.index[df_rfm["customer_uid"] == uid][0]
    details = get_order_details(cx_uid=uid, uid_col="customer_uid", from_frame=df_orders)
    df_rfm.at[index, "order_id_list"] = details["order_list"]
    df_rfm.at[index, "monetary"] = details["total_spent"]
In [ ]:
for tuple in df_rfm.itertuples():
    index = tuple.Index
    order_list = tuple.order_id_list
    min_max_dt = get_min_max_dt(order_list=order_list, from_frame=df_orders, dt_col="order_purchase_dt")
    df_rfm.at[index, "most_ancient_order_dt"] = min_max_dt["min"]
    df_rfm.at[index, "most_recent_order_dt"] = min_max_dt["max"]
In [ ]:
most_recent_global = df_rfm["most_recent_order_dt"].max()
In [ ]:
def get_recency(row, most_recent_global) -> pd.Timedelta:
    """
    Returns timedelta in seconds between most recent purchase global and most recent purchase cx
    """
    return (most_recent_global - row["most_recent_order_dt"]).total_seconds()


def get_frequency(row, most_recent_global):
    """
    Returns avg. purchases made per active month
    """
    account_timespan = most_recent_global - row["most_ancient_order_dt"]
    try:
        return len(row["order_id_list"]) / (account_timespan.total_seconds() / 2419200)  # Purchase per active month
    except ZeroDivisionError:
        return 0


def get_num_order(row):
    return len(row["order_id_list"])
In [ ]:
df_rfm["recency"] = df_rfm.apply(get_recency, axis=1, args=(most_recent_global, ))
In [ ]:
df_rfm["frequency"] = df_rfm.apply(get_frequency, axis=1, args=(most_recent_global, ))
In [ ]:
df_rfm["num_orders"] = df_rfm.apply(get_num_order, axis=1)
In [ ]:
df_rfm.head()
Out[ ]:
customer_uid order_id_list most_ancient_order_dt most_recent_order_dt recency frequency monetary num_orders
0 1 [88493] 2017-05-16 15:05:35 2017-05-16 15:05:35 44850283.0 0.053939 146.87 1
1 2 [90419] 2018-01-12 20:48:24 2018-01-12 20:48:24 24007314.0 0.100769 335.48 1
2 3 [22558] 2018-05-19 16:07:45 2018-05-19 16:07:45 13051353.0 0.185360 157.73 1
3 4 [32181] 2018-03-13 16:06:38 2018-03-13 16:06:38 18840220.0 0.128406 173.3 1
4 5 [69903] 2018-07-29 09:51:30 2018-07-29 09:51:30 6939528.0 0.348612 252.25 1

Part 2 - conclusion :¶

  • Many cells are not useful and can be safely deleted (all cells displaying dfs and head() / info())
  • DataFrame created containing RFM values :
    • Recency is delta T in seconds between Cx most recent order and Global most recent order
    • Frequency is purchases made / account creation to most recent order | In purchase per month
    • Monetary is total spent on all order, including freight which is -always- paid

3 : Early visualizations¶

Goals :

  • Identify possible clusters using only RFM and scaling (most likely min-max scaler)
  • Apply Pareto Principle (irrelevant) : 20% of customers generate 80% of transactions.
  • Identify possibly lacking variables overlooked by RFM method (to be confirmed by part 4)
  • Plot RFM two by two, then 3D attempt and/or radar

1 : Intuition : Most clients make just one purchase¶

In [ ]:
df_rfm.sort_values("num_orders", ascending=False)
Out[ ]:
customer_uid order_id_list most_ancient_order_dt most_recent_order_dt recency frequency monetary num_orders
14104 14105 [2341, 3661, 5168, 11695, 16232, 19128, 24887,... 2017-05-15 23:30:03 2018-08-20 19:14:26 5004952.0 0.915825 902.04 17
5105 5106 [3006, 14074, 28926, 41710, 46195, 53989, 8032... 2017-09-18 18:53:15 2018-02-27 18:36:39 20040819.0 0.639687 1172.67 9
6995 6996 [8031, 22631, 24841, 53421, 65531, 75391, 83571] 2017-11-13 16:44:41 2018-02-14 13:22:12 21182886.0 0.579827 959.01 7
26099 26100 [5298, 10136, 25332, 27522, 59635, 81816, 86325] 2017-10-09 12:34:39 2018-06-01 11:38:29 11944309.0 0.525180 1122.72 7
16760 16761 [16444, 27469, 31847, 57970, 58380, 61165, 88757] 2017-09-19 01:02:44 2018-06-28 00:43:34 9650804.0 0.497859 758.83 7
... ... ... ... ... ... ... ... ...
32980 32981 [86856] 2018-06-01 11:20:52 2018-06-01 11:20:52 11945366.0 0.202522 133.89 1
32979 32980 [92350] 2017-12-02 12:07:58 2017-12-02 12:07:58 27580940.0 0.087713 112.83 1
32978 32979 [32665] 2017-11-12 20:29:28 2017-11-12 20:29:28 29278850.0 0.082626 47.74 1
32977 32978 [65645] 2017-03-03 01:23:13 2017-03-03 01:23:13 51293225.0 0.047164 142.57 1
96095 96096 [96138] 2017-12-19 14:27:23 2017-12-19 14:27:23 26103775.0 0.092676 21.77 1

96096 rows × 8 columns

In [ ]:
number_orders = df_rfm["num_orders"].value_counts().to_dict()

number_orders
Out[ ]:
{1: 93099, 2: 2745, 3: 203, 4: 30, 5: 8, 6: 6, 7: 3, 9: 1, 17: 1}
In [ ]:
fig, (ax1) = plt.subplots(
    ncols=1,
    nrows=1,
    figsize=(12, 6),
    dpi=pc_dpi,
)

ax1.bar(x=list(number_orders.keys()), height=(number_orders.values()), width=1, color="navy", edgecolor="black")

###
# Titles/Lables
ax1.set_xlabel("Number of orders")
ax1.set_xticks(range(0, max(list(number_orders.keys()))))
ax1.set_ylabel("Number of customers")
fig.suptitle("Number of orders distribution in dataset")
#
###

fig.tight_layout()
plt.show()
In [ ]:
# Excluding 0 : 
fig, (ax1) = plt.subplots(
    ncols=1,
    nrows=1,
    figsize=(12, 6),
    dpi=pc_dpi,
)

try:
    number_orders.pop(0)
except KeyError:
    pass

ax1.bar(x=list(number_orders.keys()), height=number_orders.values(), width=1, color="navy", edgecolor="black")

###
# Titles/Lables
ax1.set_xlabel("Number of orders")
ax1.set_ylabel("Number of customers")
ax1.set_xticks(range(2, max(list(number_orders.keys()))))
fig.suptitle("Number of orders per cx, exclusion of unique orders")
#
###

fig.tight_layout()
plt.show()

Observations :¶

 Clearly, Olist has either problems keeping customers "loyals" or it is easier to create a new account each visit. In any case, the amount of customers who made exactly one purchase is crushingly larger than all other customers combined, regardless of frequency (93099 customers).  Excluding customers who made exactly one purchase, the majority of remaining customers order on Olist between .1 and .4 times a month (between 1.1 and 4.8 times a year). There are extremes ordering up to 1.7 times a month. But these customers are extremely rare.

  • Operating under the assumption that Olist did not provide its whole database dump, early conclusions cannot be drawn.

2 : How much have customers spent ?¶

In [ ]:
fig, (ax1) = plt.subplots(
    ncols=1,
    nrows=1,
    figsize=(12, 5),
    dpi=pc_dpi,
)

flierprops={"marker": "+", "markersize": 2, "markerfacecolor": "navy"}

ax1.boxplot(x="monetary", data=df_rfm, showmeans=True, vert=False, flierprops=flierprops)

###
# Titles/Lables
fig.suptitle("Customer repartition by total spent on Olist")
#
###

fig.tight_layout()
plt.show()
In [ ]:
df_rfm["monetary"].value_counts()
Out[ ]:
0.00      676
77.57     246
35.00     160
73.34     148
116.94    125
         ... 
28.31       1
203.23      1
139.48      1
75.74       1
92.36       1
Name: monetary, Length: 31734, dtype: int64

Lots of zeroes, either purchased is canceled or refunded. - Lets drop those and zoom on 0 -> 1K

In [ ]:
fig, (ax1) = plt.subplots(
    ncols=1,
    nrows=1,
    figsize=(12, 5),
    dpi=pc_dpi,
)

flierprops={"marker": "+", "markersize": 2, "markerfacecolor": "navy"}

ax1.boxplot(x="monetary", data=df_rfm[df_rfm["monetary"] != 0.0], showmeans=True, vert=False, flierprops=flierprops)

###
# Titles/Lables
ax1.set_xlim(0, 1000)
fig.suptitle("Customer repartition by total spent on Olist | Transaction = 0 removed & Zoom on 0-1K")
#
###

fig.tight_layout()
plt.show()
In [ ]:
# For some reason .describe() does not work here
print("Q1 :", df_rfm["monetary"].quantile(.25))
print("Q3 :", df_rfm["monetary"].quantile(.75))
print("Median :", df_rfm["monetary"].median())
print("Avg. :", np.average(df_rfm["monetary"].values))
Q1 : 62.39
Q3 : 182.2375
Median : 107.27000000000001
Avg. : 164.87214077588146
In [ ]:
outliers = df_rfm[df_rfm["monetary"] < 62.39]
outliers = df_rfm[df_rfm["monetary"] > 182.2375]
print("Max :", max(df_rfm["monetary"].values))
print(len(outliers))
del outliers
Max : 13664.08
24024

Observations :¶

 Most clients spend between 62.39 & 182.24 Reales (Assuming it's in Reales and not in Dollars, it's not precised in the dataset and website screenshots show that Reales is used), with the median at 107.28 and an average of 164.88 R$.

 There is a wide range of outliers (24024) with spendings going from +Q3(182.2375 R$) to 13664.08 R$, but, mostly, outliers spent 4K max with a very small number of customers crossing this line.

  • Operating under the assumption that Olist did not provide its whole database dump, early conclusions cannot be drawn.

3 : Time Delta between global last order and customer last order¶

In [ ]:
df_rfm.describe()["recency"]
Out[ ]:
count    9.609600e+04
mean     2.490060e+07
std      1.325517e+07
min      0.000000e+00
25%      1.415973e+07
50%      2.323386e+07
75%      3.433116e+07
max      6.677370e+07
Name: recency, dtype: float64
In [ ]:
fig, (ax1) = plt.subplots(
    ncols=1,
    nrows=1,
    figsize=(12, 6),
    dpi=pc_dpi,
)

ax1.boxplot(x="recency", data=df_rfm, vert=False, showmeans=True)

###
# Titles/Lables
fig.suptitle("Distribution of customers based on their last order time:")
ax1.set_xlabel("Time Delta (seconds)")
#
###

fig.tight_layout()
plt.show()
In [ ]:
# Not very visual so let's convert seconds to days : 
fig, (ax1) = plt.subplots(
    ncols=1,
    nrows=1,
    figsize=(12, 6),
    dpi=pc_dpi,
)

deltas = df_rfm["recency"]
deltas_days = np.divide(deltas, 86400)

ax1.boxplot(x=deltas_days, vert=False, showmeans=True)

###
# Titles/Lables
fig.suptitle("Distribution of customers based on their last order time :")
ax1.set_xlabel("Time Delta (days)")
#
###

fig.tight_layout()
plt.show()
In [ ]:
deltas_days.describe()
Out[ ]:
count    96096.000000
mean       288.201370
std        153.416318
min          0.000000
25%        163.885744
50%        268.910440
75%        397.351398
max        772.843738
Name: recency, dtype: float64

The data that most customers haven't ordered in a while¶

  • 75% of the customers in the database haven't ordered anything in at least the last 164 days (rounded up).
  • Operating under the assumption that Olist did not provide its whole database dump, early conclusions cannot be drawn.
In [ ]:
grid = sns.pairplot((df_rfm[["recency", "frequency", "monetary"]]))

grid.figure.figsize = (4, 4)
grid.figure.dpi = pc_dpi

###
# Titles/Lables
grid.figure.suptitle("Pairplot between RFM variables")
#
###
grid.figure.tight_layout()
plt.show()

Conclusion¶

 There are no obvious clusters distinguishable using simply RFM variables and pairploting. 3D Plotting of RFM can be attempted but it seems the data of the 3 variables will not be sufficient to offer what the customer (Olist) is looking for.  The Wikipedia page regarding Olist mentions that there are 2M+ unique active customers, so it looks like we only have a small fragment of their database : we know we have 96K accounts and it exists 2M+ accounts, but we are not aware of other useful stastistics, like the number of order, which would explain why our frequency is = 0 in close to 99% of the customers.

4 : 3D Visualisation of RFM variables¶

We hope to obtain visual clusters to identify and classify customers using the MK1 eyeball without any clustering algorithms or data engineering

In [ ]:
labels_dict ={
        "recency": "Recency (in seconds)",
        "frequency": "Frequncy of purchase(s)",
        "monetary": "Total Spent on Olist"
    }

marker_style = {
    "color": 'navy',
    "size": 5,
    }

fig = px.scatter_3d(
    data_frame=df_rfm, x="recency",
    y="frequency", z="monetary",
    width=5 * pc_dpi, height=3 * pc_dpi,
    labels=labels_dict,
    )


fig.update_layout(
    margin=dict(l=40, r=40, t=40, b=40),
    title="3D Representation of customers, RFM approach",
)

fig.update_traces(marker=marker_style)

fig.show()

Unclear¶

  • The most recent "order" is not really an order as it sum total is 0. Maybe it was too recent for the payment for being processed, awaiting further directions for actions on total_spent = 0
  • We can clearly see the extremes and what looks like a zone where most Cxs are placed, but the clustering is still unclear.

4 : Applying classification algorithms (K-Means, DBSCAN, Agglomerative clustering)¶

Following the inconclusive results (as expected) of part #3, Machine Learning classification algorithms can be applied to determine if clusters can be found.

4.0 : Min Max Scaling the dataset¶

0 -> 10 or 0 -> 5 will be better suited for understanding data like reviews and ratings

In [ ]:
mms = MinMaxScaler(feature_range=(0, 10))

dropcols = ["order_id_list", "most_ancient_order_dt", "most_recent_order_dt", "num_orders"]

df_rfm_mms = df_rfm.drop(columns=dropcols)

df_rfm_mms.set_index("customer_uid", inplace=True)

keepcols = df_rfm_mms.columns

df_rfm_mms = mms.fit_transform(df_rfm_mms.to_numpy())

df_rfm_mms = pd.DataFrame(df_rfm_mms, columns=keepcols)

df_rfm_mms.head()
Out[ ]:
recency frequency monetary
0 6.716759 0.316991 0.107486
1 3.595325 0.592200 0.245520
2 1.954565 1.089323 0.115434
3 2.821503 0.754616 0.126829
4 1.039261 2.048718 0.184608

4.1 : Using and optimizing k-means clustering¶

Trial and error will be used to determine the best k-range to pass to the optimizer

In [ ]:
k_range = range(2, 9)

k_means_optimizer(data=df_rfm_mms, k_range=k_range)

Observations :¶

  4 and 5 both seem to be the optimal k-numbers of clusters, the error keeps diminishing with the increasing number of clusters but the silhouette score is acceptable, performing k-means with k=4 (k=5 doesn't look as good) and plotting the results

It seems the silhouette score calculation is what makes the optimizer takes it's sweet sweet time, if not needed for comparison, it can be considered to rely solely on inertia

In [ ]:
km = KMeans(n_clusters=4)
y_predicted = km.fit_predict(df_rfm_mms)

df_rfm_mms["cluster_4"] = y_predicted
In [ ]:
labels_dict ={
        "recency": "Recency",
        "frequency": "Frequency of purchase(s)",
        "monetary": "Total Spent on Olist"
    }

marker_style = {
    "size": 5,
    }

fig = px.scatter_3d(
    data_frame=df_rfm_mms, x="recency",
    y="frequency", z="monetary", color="cluster_4",
    width=4 * pc_dpi, height=3 * pc_dpi,
    labels=labels_dict,
    )


fig.update_layout(
    margin=dict(l=40, r=40, t=40, b=40),
    title="3D Representation of customers, RFM approach, k-means clustering, k=4",
)

fig.update_traces(marker=marker_style)

fig.show()

Observation :¶

 Even if the clustering is not crystal clear, we can see a somewhat clear pattern, to be expected. :

  • Cluster 4 (Farthest cluster to origin, cluster# changes) represents dissatisfied people overall, this group's mind might not be possible to change about the goals and ideas of Olist : we can theorize (and calculate later) that this group purchased mainly once, for a moderate sum of Reales, and a long time ago.
  • Cluster 2 and 3 (Center groups, cluster# changes) represents people who like Olist, they might not be huge enthusiasts pressing refresh every 10 seconds each time Olist releases a new feature but they might be the most important customers, those on which Olist might need to focus the most. Frequency is not really reliable with the data at hand but this group shows a net increase in this statistic, meanwhile their purchases have been more recent and mostly more profitable than Cluster 4
  • Cluster 1 (closest cluster to origin, cluster# changes) represents the people who believe strongly in the services Olist provides, it is very clear that this group contains the most repeating customers, who consequently put on their last order the most recently and who are ready to pay a higher price for Olist's services.

 Even if it is too early to tell, we can speculate and maybe try to apply the "Law of Diffusion of Innovation", Everett Rogers' 1968 theory and reapplied to business models by the now renowned business writer and public speaker Simon Sinek.

 This theory states that :

  • The first 2.5% of Investors are the innovators, believing strongly in the company's ideas and willing to take risks on said idea
  • The next 13.5% are the Early adopters, willing to take risks to try a new model/piece of technology etc.
  • The next 34% are the Early Majority and the following 34% are the Late Majority, investing or using the product/service because it is trending and tested by the first two groups
  • The last 16% are the laggards, who wont willingly make any effort or take any risk for an innovation, newer product or service, this group is not swayed by advertising, word of mouth etc. and represent poor investment in marketing  This Law also theorizes that the tipping point for a new product/service to really take off, it needs the first 15% of these people, group 1 and 2, and that the rest of the majority, where the majority of consumers are, will follow

 We can quantify our clusters and see if we can indeed see what looks like this distribution : group 1 & 2 believing strongly in the idea (cluster 1), group 3 & 4 being the majority (cluster 0 and 2) and group 5 (cluster 3) being the laggards.

In [ ]:
inv = int(input("Investors : "))
maj1 = int(input("Early Majority : "))
maj2 = int(input("Late Majority : "))
lagg = int(input("Laggards : "))
In [ ]:
fig, (ax1) = plt.subplots(
    ncols=1,
    nrows=1,
    figsize=(12, 6),
    dpi=pc_dpi,
)

amounts = df_rfm_mms["cluster_4"].value_counts()
amounts_inv = amounts[inv]
amount_maj_1 = amounts[maj1]
amount_maj_2 = amounts[maj2]
amounts_maj = amount_maj_1 + amount_maj_2
amount_lag = amounts[lagg]
total = len(df_rfm_mms)

cluster_dict = {
        f"Investors-{round((amounts_inv / total) * 100, ndigits=2)}%": amounts_inv,
        f"Majority-{round((amounts_maj / total) * 100, ndigits=2)}%": amounts_maj,
        f"Laggards-{round((amount_lag / total) * 100, ndigits=2)}%": amount_lag
    }

my_colors = ["royalblue", "#003153", "red"]

ax1.bar(
        x=list(cluster_dict.keys()),
        height=list(cluster_dict.values()),
        color=my_colors
    )

###
# Titles/Lables
ax1.set_xlabel("Law of Diffusion Classes (with respective percentages)")
ax1.set_ylabel("Customer per Classes")
fig.suptitle("Expression of Customers theorizing the Law of Diffusion of Innovation")
#
###

fig.tight_layout()
plt.show()

Observation :¶

 The above graph seems to confirm the intuition that Olist's customers follow indeed the Law of Diffusion of Innovation (or consumption here). It is very interesting to see that the cuts between the classes are clear and provide a potential course of action for Olist.  Indeed, according to the Law of Diffusion of Innovation :

  • Our "Investors" (here around 20% based on each run) are already satisfied with Olist, its services and its methods. They are happy with Olist as is and necessitate little to no effort to keep loyal.
  • On the other hand, the "Laggards" (here around 18%) could be interpreted as customers who are either not interested in the services Olist provides, do not wish to change their habits of consumption or are generally disappointed by the services ; this group -cannot- be swayed, or at a high cost.
  • The Majority (Early and Late on the above graph are merged) are the group on which Olist should concentrate its marketing efforts : they are often the followers of the first group. On a purely economical standpoint, this is the most important group to satisfy while keeping group 1 happy.

4.2 : DBSCAN Clustering :¶

 DBSCAN is another algorithm testable on this dataset. We first need to determine the best max intra cluster distance (epsilon) and the best minimal amount of points it takes to make a cluster, this is often dimension + 1 or dimension * 2  Epsilon can be determined using k neighbors. Using a graph to represent the avg distance between a point and its k-neighbors (here 4 : dimension + 1). Zooming in and using the elbow method help us to focus on the best potential epsilon.

In [ ]:
neighbors_matrix = df_rfm_mms[["recency", "frequency", "monetary"]].to_numpy()
nneighbors = NearestNeighbors(n_neighbors=4, n_jobs=-1)  # dataset dim + 1

nneighbors.fit(X=neighbors_matrix)

distances, potential_eps = nneighbors.kneighbors(neighbors_matrix)

distances = np.sort(distances, axis=0)
distances_plot = distances[:,1]
In [ ]:
fig, (ax1) = plt.subplots(
    ncols=1,
    nrows=1,
    figsize=(16, 8),
    dpi=pc_dpi,
)

ax1.plot(distances_plot)

###
# Titles/Lables
ax1.set_xlabel("Object")
ax1.set_ylabel("k distance")
fig.suptitle("Points sorted by distance - Neighbors = 4")
#
###

fig.tight_layout()
plt.show()
In [ ]:
# Zooming up until we can see the "eblow"

fig, (ax1) = plt.subplots(
    ncols=1,
    nrows=1,
    figsize=(16, 8),
    dpi=pc_dpi,
)

ax1.plot(distances_plot)

###
# Titles/Lables
ax1.set_xlabel("Object")
ax1.set_ylabel("k distance")
ax1.set_xlim((96020, 96100))
ax1.set_yticks(np.arange(0, 5, 0.1))
ax1.grid(visible=True, axis="both")
fig.suptitle("Points sorted by distance - Neighbors = 4")
#
###

fig.tight_layout()
plt.show()
In [ ]:
# Looks like .8 is the best candidate

best_dbs = DBSCAN(eps=0.8, min_samples=4, n_jobs=-1)

y_predict = best_dbs.fit_predict(
        df_rfm_mms.drop(
                columns=["cluster", "cluster_4", "cluster_5", "cluster_DBSCAN"], errors="ignore"
            )
    )

with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    df_rfm_mms.loc[:, "cluster_DBSCAN"] = y_predict
In [ ]:
labels_dict ={
        "recency": "Recency",
        "frequency": "Frequency of purchase(s)",
        "monetary": "Total Spent on Olist"
    }

marker_style = {
    "size": 5,
    }

fig = px.scatter_3d(
    data_frame=df_rfm_mms, x="recency",
    y="frequency", z="monetary", color="cluster_DBSCAN",
    width=4 * pc_dpi, height=3 * pc_dpi,
    labels=labels_dict,
    )


fig.update_layout(
    margin=dict(l=40, r=40, t=40, b=40),
    title="3D Representation of customers, RFM approach, DBSCAN clustering",
)

fig.update_traces(marker=marker_style)

fig.show()

Observation :¶

 DBSCAN clustering is disappointing here. It identifies outliers rather than smaller groups. It doesn't help to narrow down the potential targets for any campaign. It is unclear whether or not my parameters are not correct or if the algorithm is not helpful in this case.
 A final clustering attempt can be made using Agglomerative Clustering.

4.3 : Agglomerative Clustering¶

 Agglomerative Clustering might help identify pre-determined clusters with the help of the linkage distance, which we can use from Scipy  The objective is to produce at least a similar result to K-Means, if not, it is not worth using and updating at this stage.

4.3.1 : Determining the correct amount of clusters :¶

 Quoting sources : If you want to create flat clusters we can analyze the [...] dendrogram to determine no. of clusters. We first assume that the horizontal lines are extended on both sides, and as such, they would also cross the vertical lines. Now we have to identify the tallest vertical line that does not have any horizontal line crossing through it.

In [ ]:
## Due to Apple m1 chip being a little stubborn, we need to sample the group or the overpriced piece of aluminium is gonna melt
## Taking representatives from classes from k-means, sample size = 2K
sample_inv = df_rfm_mms[df_rfm_mms["cluster_4"] == inv].sample(round((amounts_inv / len(df_rfm_mms)) * 2000))
sample_maj1 = df_rfm_mms[df_rfm_mms["cluster_4"] == maj1].sample(round((amount_maj_1 / len(df_rfm_mms)) * 2000))
sample_maj2 = df_rfm_mms[df_rfm_mms["cluster_4"] == maj2].sample(round((amount_maj_2 / len(df_rfm_mms)) * 2000))
sample_lagg = df_rfm_mms[df_rfm_mms["cluster_4"] == lagg].sample(round((amount_lag / len(df_rfm_mms)) * 2000))

df_rfm_mms_2ksample = pd.concat([sample_inv, sample_maj1, sample_maj2, sample_lagg]).sort_index()

len(df_rfm_mms_2ksample)
Out[ ]:
1999
In [ ]:
fig, (ax1) = plt.subplots(
    ncols=1,
    nrows=1,
    figsize=(8, 4),
    dpi=pc_dpi,
)

linkage_method = hierarchy.linkage(
        df_rfm_mms_2ksample.drop(columns=["cluster", "cluster_4", "cluster_5", "cluster_DBSCAN"], errors="ignore"),
        method ="ward",
        metric="euclidean",
    )
dendrogram_plot = hierarchy.dendrogram(linkage_method, ax=ax1, no_labels=True)

###
# Titles/Lables
#
###

fig.tight_layout()
plt.show()
In [ ]:
# Lets say its 3 ?

agg = AgglomerativeClustering(n_clusters=3)

#Let's try the small one first
df_rfm_mms_2ksample["cluster_agg"] = agg.fit_predict(df_rfm_mms_2ksample.drop(columns=["cluster_4", "cluster_DBSCAN"], errors="ignore"))
In [ ]:
df_rfm_mms_2ksample.head()
Out[ ]:
recency frequency monetary cluster_4 cluster_DBSCAN cluster_agg
4 1.039261 2.048718 0.184608 1 0 0
101 3.807008 0.559272 0.158532 0 0 1
195 4.129577 0.515586 0.052839 0 0 1
202 0.999133 2.130999 0.251565 1 0 0
235 4.001812 0.532047 0.094364 0 0 1
In [ ]:
labels_dict ={
        "recency": "Recency",
        "frequency": "Frequency of purchase(s)",
        "monetary": "Total Spent on Olist"
    }

marker_style = {
    "size": 5,
    }

fig = px.scatter_3d(
    data_frame=df_rfm_mms_2ksample, x="recency",
    y="frequency", z="monetary", color="cluster_agg",
    width=4 * pc_dpi, height=3 * pc_dpi,
    labels=labels_dict,
    )


fig.update_layout(
    margin=dict(l=40, r=40, t=40, b=40),
    title="3D Representation of customers, RFM approach, Agglomerative clustering - 2K sample",
)

fig.update_traces(marker=marker_style)

fig.show()

Observation :¶

This clustering method could be fine, and sure, it works on a 2K sample but an I7 4.3GhZ w/ 32Gb RAM couldn't hack it in an hour for 100K. So it is not in the interest of Olist to use an algorithm that demanding for their 2M unique customers.

Test has been done on Desktop using :

  • i7-97000K
  • 32Gb RAM 2133 MhZ
  • NVidia 3080 4GB integrated RAMDAC
  • Windows 10
  • Visual Studio Code with Python 3.10.5

    Test was stopped at the one hour mark on the 96 000 cx sample because it was simply not feasible, much less scalable

5 : Conclusion and potential improvements :¶

  • K-Means clustering shows a real way of clustering the customers, the theory is backed up by a proven by time "law" : the diffusion of innovation, this is only a primary result but it narrows down considerably Olist's targets.
  • Further improvements could be, and will be made, using and testing metrics like :
    • Geo location : how the customers are clustered and where a campaign can have the maximum impact
    • Delta Dist between buyers and sellers : Do customers consume close to home ? Does this matter ?
    • Reviews : Are consumers driven to a product because of it's previous ratings, and do potential loyal customer leave reviews more often ?
    • Categories : Are some categories more successful than others ? Are giants like Amazon and Ebay swallowing a given category while leaving others to local businesses, favored by Olist ?

Not all of this points might be relevant but including them in the model could make a difference.


We can, before export, take a look at how much each group spend, as the sum total of the group, and the average by member of said group


Export :¶

Will be exported :

  • customers unique ids
  • the list of their order(s)
  • the RFM data
  • the initial clusters found by k-means --> Unexpectedly positive results could mean improvements on further models
  • Cluster names, so that we will know whos who
In [ ]:
# Export :

df_rfm_mms.head()
Out[ ]:
recency frequency monetary cluster_4 cluster_DBSCAN
0 6.716759 0.316991 0.107486 2 0
1 3.595325 0.592200 0.245520 3 0
2 1.954565 1.089323 0.115434 3 0
3 2.821503 0.754616 0.126829 3 0
4 1.039261 2.048718 0.184608 1 0
In [ ]:
df_rfm.head()
Out[ ]:
customer_uid order_id_list most_ancient_order_dt most_recent_order_dt recency frequency monetary num_orders
0 1 [88493] 2017-05-16 15:05:35 2017-05-16 15:05:35 44850283.0 0.053939 146.87 1
1 2 [90419] 2018-01-12 20:48:24 2018-01-12 20:48:24 24007314.0 0.100769 335.48 1
2 3 [22558] 2018-05-19 16:07:45 2018-05-19 16:07:45 13051353.0 0.185360 157.73 1
3 4 [32181] 2018-03-13 16:06:38 2018-03-13 16:06:38 18840220.0 0.128406 173.3 1
4 5 [69903] 2018-07-29 09:51:30 2018-07-29 09:51:30 6939528.0 0.348612 252.25 1
In [ ]:
# one exception, hence try / except

def get_kmeans(row):
    try:
        return int(df_rfm_mms.at[row["customer_uid"], "cluster_4"])
    except KeyError:
        pass


def get_dbscan(row):
    try:
        return df_rfm_mms.at[row["customer_uid"], "cluster_DBSCAN"]
    except KeyError:
        pass
In [ ]:
df_rfm["cluster_kmeans_4"] = df_rfm.apply(get_kmeans, axis=1)
df_rfm["cluster_DBSCAN"] = df_rfm.apply(get_dbscan, axis=1)
In [ ]:
df_rfm.dropna(subset=["cluster_kmeans_4", "cluster_DBSCAN"], inplace=True)
In [ ]:
df_rfm["cluster_kmeans_4"] = df_rfm["cluster_kmeans_4"].astype(int)
df_rfm["cluster_DBSCAN"] = df_rfm["cluster_DBSCAN"].astype(int)
In [ ]:
def get_name(grp_nbr: int)-> str:
    if grp_nbr == inv:
        return "Investors"
    elif grp_nbr == maj1:
        return "Early Majority"
    elif grp_nbr == maj2:
        return "Late Majority"
    elif grp_nbr == lagg:
        return "Laggards"
    else:
        raise Exception("group number not defined")

def addlabels_1(x, y):
    for i in range(len(x)):
        plt.text(i, y[i], f"{round(y[i], ndigits=2)} R$", ha="center", bbox=dict(facecolor="white", alpha=1))

def addlabels(x, y):
    for i in range(len(x)):
        plt.text(i, y[i], f"{y[i]} R$", ha="center", bbox=dict(facecolor="white", alpha=1))
In [ ]:
# Total per cluster :
k_means_clusters = df_rfm["cluster_kmeans_4"].unique().tolist()

Group_money = namedtuple("Group_money", field_names=["group_name", "group_total", "avg_p_capita"])

group_spending = []

for cluster in k_means_clusters:
    total_spent = df_rfm[df_rfm["cluster_kmeans_4"] == cluster]["monetary"].values.sum()
    avg_pc = round(np.average(df_rfm[df_rfm["cluster_kmeans_4"] == cluster]["monetary"]), ndigits=2)
    group_name = get_name(cluster)
    group_spending.append(Group_money(group_name=group_name, group_total=total_spent, avg_p_capita=avg_pc))
In [ ]:
fig, (ax1) = plt.subplots(
    ncols=1,
    nrows=1,
    figsize=(12, 8),
    dpi=pc_dpi,
)

ax1.bar(
    x=[group.group_name for group in group_spending],
    height=[group.group_total for group in group_spending],
    width=.8, color=["royalblue", "#003153", "navy", "red"], edgecolor="black",
    )

###
# Titles/Lables
addlabels_1(x=[group.group_name for group in group_spending], y=[group.group_total for group in group_spending])
ax1.set(yticklabels=[])
ax1.tick_params(left=False)
fig.suptitle("Total spent by clusters determined by k-means w/ k=4")
#
###

fig.tight_layout()
plt.show()

Observations :¶

 As expected, the majority is the group spending the most, as they represent the two largest clusters, it is to be expected. The Early Majority seems to be spending more than the late majority.
 Surprisingly, the most loyal customers seem to spend as much as the laggards, with clusters of approximately the same size that is surprising.
 Plotting the avg spending per member might help clarify these observations.

In [ ]:
fig, (ax1) = plt.subplots(
    ncols=1,
    nrows=1,
    figsize=(12, 8),
    dpi=pc_dpi,
)

ax1.bar(
    x=[group.group_name for group in group_spending],
    height=[group.avg_p_capita for group in group_spending],
    width=.8, color=["royalblue", "#003153", "navy", "red"], edgecolor="black",
    )

###
# Titles/Lables
addlabels(x=[group.group_name for group in group_spending], y=[group.avg_p_capita for group in group_spending])
ax1.set(yticklabels=[])
ax1.tick_params(left=False)
fig.suptitle("Spending per capita by clusters determined by k-means w/ k=4")
#
###

fig.tight_layout()
plt.show()

 Ok so thats surprising. Each group has an average spending per capita more or less equal, stdeviation of only around 5R$. The determining factor in clustering was not monetary surely, mostly frequency and recency. We will build a more robust model to identify that phenomenon.

In [ ]:
def get_k_cluster_names(row):
    cluster = row["cluster_kmeans_4"]
    if cluster == inv:
        return "investors"
    elif cluster == maj1:
        return "early_majority"
    elif cluster == maj2:
        return "late_majority"
    elif cluster == lagg:
        return "laggards"
In [ ]:
df_rfm["k_cluster_name"] = df_rfm.apply(get_k_cluster_names, axis=1)
In [ ]:
df_rfm.head()
Out[ ]:
customer_uid order_id_list most_ancient_order_dt most_recent_order_dt recency frequency monetary num_orders cluster_kmeans_4 cluster_DBSCAN k_cluster_name
0 1 [88493] 2017-05-16 15:05:35 2017-05-16 15:05:35 44850283.0 0.053939 146.87 1 3 0 early_majority
1 2 [90419] 2018-01-12 20:48:24 2018-01-12 20:48:24 24007314.0 0.100769 335.48 1 3 0 early_majority
2 3 [22558] 2018-05-19 16:07:45 2018-05-19 16:07:45 13051353.0 0.185360 157.73 1 3 0 early_majority
3 4 [32181] 2018-03-13 16:06:38 2018-03-13 16:06:38 18840220.0 0.128406 173.3 1 1 0 investors
4 5 [69903] 2018-07-29 09:51:30 2018-07-29 09:51:30 6939528.0 0.348612 252.25 1 0 0 late_majority
In [ ]:
df_rfm.to_csv(path_or_buf="../data/optimized/cx_rfm.csv", index=False)
df_rfm.to_pickle(path="../pickles/cx_rfm.pkl")